{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Example usage of the ArXiv project source"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import sys\n",
    "if '..' not in sys.path:\n",
    "    sys.path.append('..')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from arxiv_analysis import load\n",
    "arxiv = load.ArxivDataset.load(\n",
    "    loc='../data/example/arxiv_meta.jsonl'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>submitter</th>\n",
       "      <th>authors</th>\n",
       "      <th>title</th>\n",
       "      <th>comments</th>\n",
       "      <th>journal-ref</th>\n",
       "      <th>categories</th>\n",
       "      <th>abstract</th>\n",
       "      <th>update_date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>(Pavel, Nadolsky)</td>\n",
       "      <td>(C., Bal\\'azs, ,, E., L., Berger, ,, P., M., N...</td>\n",
       "      <td>(Calculation, of, prompt, diphoton, production...</td>\n",
       "      <td>(37, pages, ,, 15, figures, ;, published, vers...</td>\n",
       "      <td>(Phys, ., Rev, ., D76:013009,2007)</td>\n",
       "      <td>[5]</td>\n",
       "      <td>(A, fully, differential, calculation, in, pert...</td>\n",
       "      <td>2008-11-26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>(Louis, Theran)</td>\n",
       "      <td>(Ileana, Streinu, and, Louis, Theran)</td>\n",
       "      <td>(Sparsity, -, certifying, Graph, Decompositions)</td>\n",
       "      <td>(To, appear, in, Graphs, and, Combinatorics)</td>\n",
       "      <td>None</td>\n",
       "      <td>[7, 3]</td>\n",
       "      <td>(We, describe, a, new, algorithm, ,, the, $, (...</td>\n",
       "      <td>2008-12-13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>(Hongjun, Pan)</td>\n",
       "      <td>(Hongjun, Pan)</td>\n",
       "      <td>(The, evolution, of, the, Earth, -, Moon, syst...</td>\n",
       "      <td>(23, pages, ,, 3, figures)</td>\n",
       "      <td>None</td>\n",
       "      <td>[9]</td>\n",
       "      <td>(The, evolution, of, Earth, -, Moon, system, i...</td>\n",
       "      <td>2008-01-13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>(David, Callan)</td>\n",
       "      <td>(David, Callan)</td>\n",
       "      <td>(A, determinant, of, Stirling, cycle, numbers,...</td>\n",
       "      <td>(11, pages)</td>\n",
       "      <td>None</td>\n",
       "      <td>[7]</td>\n",
       "      <td>(We, show, that, a, determinant, of, Stirling,...</td>\n",
       "      <td>2007-05-23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>(Alberto, Torchinsky)</td>\n",
       "      <td>(Wael, Abu, -, Shammala, and, Alberto, Torchin...</td>\n",
       "      <td>(From, dyadic, $, \\Lambda_{\\alpha}$, to, $, \\L...</td>\n",
       "      <td>None</td>\n",
       "      <td>(Illinois, J., Math, ., 52, (, 2008, ), no.2, ...</td>\n",
       "      <td>[6, 8]</td>\n",
       "      <td>(In, this, paper, we, show, how, to, compute, ...</td>\n",
       "      <td>2013-10-15</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               submitter                                            authors  \\\n",
       "0      (Pavel, Nadolsky)  (C., Bal\\'azs, ,, E., L., Berger, ,, P., M., N...   \n",
       "1        (Louis, Theran)              (Ileana, Streinu, and, Louis, Theran)   \n",
       "2         (Hongjun, Pan)                                     (Hongjun, Pan)   \n",
       "3        (David, Callan)                                    (David, Callan)   \n",
       "4  (Alberto, Torchinsky)  (Wael, Abu, -, Shammala, and, Alberto, Torchin...   \n",
       "\n",
       "                                               title  \\\n",
       "0  (Calculation, of, prompt, diphoton, production...   \n",
       "1   (Sparsity, -, certifying, Graph, Decompositions)   \n",
       "2  (The, evolution, of, the, Earth, -, Moon, syst...   \n",
       "3  (A, determinant, of, Stirling, cycle, numbers,...   \n",
       "4  (From, dyadic, $, \\Lambda_{\\alpha}$, to, $, \\L...   \n",
       "\n",
       "                                            comments  \\\n",
       "0  (37, pages, ,, 15, figures, ;, published, vers...   \n",
       "1       (To, appear, in, Graphs, and, Combinatorics)   \n",
       "2                         (23, pages, ,, 3, figures)   \n",
       "3                                        (11, pages)   \n",
       "4                                               None   \n",
       "\n",
       "                                         journal-ref categories  \\\n",
       "0                 (Phys, ., Rev, ., D76:013009,2007)        [5]   \n",
       "1                                               None     [7, 3]   \n",
       "2                                               None        [9]   \n",
       "3                                               None        [7]   \n",
       "4  (Illinois, J., Math, ., 52, (, 2008, ), no.2, ...     [6, 8]   \n",
       "\n",
       "                                            abstract update_date  \n",
       "0  (A, fully, differential, calculation, in, pert...  2008-11-26  \n",
       "1  (We, describe, a, new, algorithm, ,, the, $, (...  2008-12-13  \n",
       "2  (The, evolution, of, Earth, -, Moon, system, i...  2008-01-13  \n",
       "3  (We, show, that, a, determinant, of, Stirling,...  2007-05-23  \n",
       "4  (In, this, paper, we, show, how, to, compute, ...  2013-10-15  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "arxiv.df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>submitter</th>\n",
       "      <th>authors</th>\n",
       "      <th>title</th>\n",
       "      <th>comments</th>\n",
       "      <th>journal-ref</th>\n",
       "      <th>categories</th>\n",
       "      <th>abstract</th>\n",
       "      <th>update_date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0704.0001</th>\n",
       "      <td>(Pavel, Nadolsky)</td>\n",
       "      <td>(C., Bal\\'azs, ,, E., L., Berger, ,, P., M., N...</td>\n",
       "      <td>(Calculation, of, prompt, diphoton, production...</td>\n",
       "      <td>(37, pages, ,, 15, figures, ;, published, vers...</td>\n",
       "      <td>(Phys, ., Rev, ., D76:013009,2007)</td>\n",
       "      <td>[hep-ph]</td>\n",
       "      <td>(A, fully, differential, calculation, in, pert...</td>\n",
       "      <td>2008-11-26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0704.0002</th>\n",
       "      <td>(Louis, Theran)</td>\n",
       "      <td>(Ileana, Streinu, and, Louis, Theran)</td>\n",
       "      <td>(Sparsity, -, certifying, Graph, Decompositions)</td>\n",
       "      <td>(To, appear, in, Graphs, and, Combinatorics)</td>\n",
       "      <td>None</td>\n",
       "      <td>[math.co, cs.cg]</td>\n",
       "      <td>(We, describe, a, new, algorithm, ,, the, $, (...</td>\n",
       "      <td>2008-12-13</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   submitter  \\\n",
       "0704.0001  (Pavel, Nadolsky)   \n",
       "0704.0002    (Louis, Theran)   \n",
       "\n",
       "                                                     authors  \\\n",
       "0704.0001  (C., Bal\\'azs, ,, E., L., Berger, ,, P., M., N...   \n",
       "0704.0002              (Ileana, Streinu, and, Louis, Theran)   \n",
       "\n",
       "                                                       title  \\\n",
       "0704.0001  (Calculation, of, prompt, diphoton, production...   \n",
       "0704.0002   (Sparsity, -, certifying, Graph, Decompositions)   \n",
       "\n",
       "                                                    comments  \\\n",
       "0704.0001  (37, pages, ,, 15, figures, ;, published, vers...   \n",
       "0704.0002       (To, appear, in, Graphs, and, Combinatorics)   \n",
       "\n",
       "                                  journal-ref        categories  \\\n",
       "0704.0001  (Phys, ., Rev, ., D76:013009,2007)          [hep-ph]   \n",
       "0704.0002                                None  [math.co, cs.cg]   \n",
       "\n",
       "                                                    abstract update_date  \n",
       "0704.0001  (A, fully, differential, calculation, in, pert...  2008-11-26  \n",
       "0704.0002  (We, describe, a, new, algorithm, ,, the, $, (...  2008-12-13  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "arxiv[:2]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
